/* 
 * Copyright (C) 2014 Bartosz Stankiewicz.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301  USA
 */
package pl.poznan.put.et.kstio.invoice_ocr;

import java.awt.Rectangle;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.annotation.PostConstruct;
import javax.ejb.EJB;
import javax.ejb.Singleton;
import javax.imageio.IIOImage;

import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import net.sourceforge.tess4j.util.ImageIOHelper;

import org.apache.log4j.Logger;

import pl.poznan.put.et.kstio.invoice_ocr.model.InvoiceOcrProfile;

/**
 * Singleton bean that owns the shared {@link Tesseract} engine and offers
 * OCR helpers for invoice PDFs: whole-document OCR, OCR of configured page
 * regions, and extraction of NIP numbers from recognised text.
 *
 * <p>NIP is presumably the Polish tax identification number; the code only
 * relies on it being a ten-digit number preceded by an "NIP" label.</p>
 *
 * @author BartaZ
 */
@Singleton
public class TessInstanceBean implements TessInstance {

	private static final Logger logger = Logger.getLogger(TessInstanceBean.class);

	/**
	 * Ten digit-like characters, optionally separated by spaces and, at the
	 * positions used by the common NIP groupings, by single dashes. Besides
	 * real digits the character class accepts the letters T, I, l, o, O and g,
	 * which are later mapped back to digits (see {@link #normalizeNipDigits}).
	 */
	private static final String NIP_NUMBER_REGEX = "[TIl1234567890oOg]{1}[ ]*[TIl1234567890oOg]{1}[ ]*[TIl1234567890oOg]{1}[ ]*-?[ ]*[TIl1234567890oOg]{1}[ ]*[TIl1234567890oOg]{1}[ ]*-?[ ]*[TIl1234567890oOg]{1}[ ]*-?[ ]*[TIl1234567890oOg]{1}[ ]*-?[ ]*[TIl1234567890oOg]{1}[ ]*-?[ ]*[TIl1234567890oOg]{1}[ ]*[TIl1234567890oOg]{1}";

	/**
	 * The "NIP" label (tolerating spaces between letters and several
	 * look-alike characters for the middle letter), an optional punctuation
	 * mark and up to two optional letters before the number.
	 */
	private static final String NIP_LABEL_REGEX = "[Nn][ ]*[Iilrt][ ]*[Pp][ ]*[,.:;=]?[ ]*[A-Za-z]?[ ]*[A-Za-z]?[ ]*";

	/** Matches a labelled NIP: the label immediately followed by the number. */
	private static final Pattern NIP_PATTERN = Pattern.compile(NIP_LABEL_REGEX + NIP_NUMBER_REGEX);

	/** Matches just the ten-character number part. */
	private static final Pattern NIP_NUMBER_PATTERN = Pattern.compile(NIP_NUMBER_REGEX);

	private Tesseract instance;

	@EJB
	private GlobalProperies globalProperies;

	/**
	 * Obtains the Tesseract engine and configures its data path and language
	 * from {@link GlobalProperies} once the bean's dependencies are injected.
	 */
	@PostConstruct
	private void init() {
		instance = Tesseract.getInstance();
		instance.setDatapath(globalProperies.getTessDataPrefix());
		instance.setLanguage(globalProperies.getTessDataLanguage());
	}

	/**
	 * @return the configured Tesseract engine held by this bean
	 */
	@Override
	public Tesseract getInstance() {
		return instance;
	}

	/**
	 * Runs OCR over every page image of the given PDF.
	 *
	 * @param pdfFile the PDF file to recognise
	 * @return the recognised text, or {@code null} when the file yields no
	 *         page images or when reading/recognition fails (the failure is
	 *         logged)
	 */
	@Override
	public String doPdfOCR(File pdfFile) {
		try {
			List<IIOImage> pdfImageList = ImageIOHelper.getIIOImageList(pdfFile);
			// Guard before touching element 0: the size logging below must not
			// turn an empty document into an unchecked exception.
			if (pdfImageList == null || pdfImageList.isEmpty()) {
				logger.warn("doOCR: no page images could be read from " + pdfFile);
				return null;
			}
			if (logger.isDebugEnabled()) {
				logger.debug("doOCR width: " + pdfImageList.get(0).getRenderedImage().getWidth());
				logger.debug("doOCR Height: " + pdfImageList.get(0).getRenderedImage().getHeight());
			}
			return instance.doOCR(pdfImageList, null);
		} catch (TesseractException ex) {
			// Pass the exception itself so the stack trace and cause are kept.
			logger.error("doOCR TesseractException: " + ex.getMessage(), ex);
		} catch (IOException ex) {
			logger.error("doOCR IOException: " + ex.getMessage(), ex);
		}
		return null;
	}

	/**
	 * Finds all labelled NIP numbers in OCR output and returns them as plain
	 * ten-character digit strings.
	 *
	 * <p>Each hit of the labelled pattern is searched for its number part,
	 * which is then stripped of separators and has OCR look-alike letters
	 * replaced by digits. Duplicates are removed; results are returned in
	 * order of first occurrence in the text.</p>
	 *
	 * @param invoiceText recognised invoice text; may be {@code null} or empty
	 * @return the distinct normalised numbers, never {@code null}; empty when
	 *         nothing matches
	 */
	@Override
	public String[] getNipFromPdf(String invoiceText) {
		// LinkedHashSet: de-duplicate while keeping a deterministic order.
		Set<String> result = new LinkedHashSet<String>();
		if (invoiceText != null && !invoiceText.isEmpty()) {
			Matcher nipAndNumberMatcher = NIP_PATTERN.matcher(invoiceText);
			while (nipAndNumberMatcher.find()) {
				String nipAndNumber = nipAndNumberMatcher.group();
				Matcher numberMatcher = NIP_NUMBER_PATTERN.matcher(nipAndNumber);
				// The labelled pattern embeds the number pattern, so this is
				// expected to succeed; checking keeps start()/end()/group()
				// from throwing if the two ever drift apart.
				if (!numberMatcher.find()) {
					continue;
				}
				result.add(normalizeNipDigits(numberMatcher.group()));
			}
		}
		return result.toArray(new String[0]);
	}

	/**
	 * Removes spaces and dashes from a matched number and maps OCR look-alike
	 * letters to digits: o/O to 0, g to 9, T to 7, l/I to 1.
	 */
	private static String normalizeNipDigits(String rawNumber) {
		return rawNumber
				.replace(" ", "")
				.replace("-", "")
				.replace('o', '0')
				.replace('O', '0')
				.replace('g', '9')
				.replace('T', '7')
				.replace('l', '1')
				.replace('I', '1');
	}

	/**
	 * Runs OCR on configured regions of the first page of a PDF.
	 *
	 * <p>Each profile's x/width values are multiplied by the page width and
	 * its y/height values by the page height (rounded to whole pixels) to
	 * form the rectangle that is recognised. The profile values are
	 * presumably fractions of the page size in the range 0..1 - confirm
	 * against {@link InvoiceOcrProfile}.</p>
	 *
	 * @param pdfFile the PDF file to read
	 * @param propertyList the region profiles to recognise; {@code null} or
	 *        empty yields an empty result
	 * @return a map from profile field name to recognised text, containing
	 *         only non-empty values; never {@code null}. If reading or
	 *         recognition fails, the error is logged and the values collected
	 *         so far are returned.
	 */
	@Override
	public Map<String, Object> getInvoiceProperty(File pdfFile, List<InvoiceOcrProfile> propertyList) {
		Map<String, Object> result = new HashMap<String, Object>();
		if (propertyList == null || propertyList.isEmpty()) {
			// Nothing to recognise; skip the expensive PDF conversion.
			return result;
		}
		try {
			List<IIOImage> pdfImageList = ImageIOHelper.getIIOImageList(pdfFile);
			if (pdfImageList != null && !pdfImageList.isEmpty()) {
				IIOImage pdfImage = pdfImageList.get(0);
				int pageWidth = pdfImage.getRenderedImage().getWidth();
				int pageHeight = pdfImage.getRenderedImage().getHeight();
				logger.debug("pageWidth: " + pageWidth + ", pageHeight: " + pageHeight);

				// doOCR takes a list of images; only the first page is used.
				List<IIOImage> firstPagePdfImageList = new ArrayList<IIOImage>();
				firstPagePdfImageList.add(pdfImage);

				for (InvoiceOcrProfile property : propertyList) {
					int xProp = Math.round(property.getX() * pageWidth);
					int yProp = Math.round(property.getY() * pageHeight);
					int widthProp = Math.round(property.getWidth() * pageWidth);
					int heightProp = Math.round(property.getHeight() * pageHeight);
					logger.debug(xProp + "," + yProp + "," + widthProp + "," + heightProp);
					String value = instance.doOCR(firstPagePdfImageList, new Rectangle(xProp, yProp, widthProp, heightProp));
					if (value != null && !value.isEmpty()) {
						result.put(property.getProfilePk().getFieldName(), value);
					}
				}
			}
		} catch (IOException | TesseractException e) {
			logger.error("cos nie tak!", e);
		}

		return result;
	}

}
